In [ ]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from datetime import datetime
In [ ]:
pd.options.display.float_format = '{:.3f}'.format
In [ ]:
# VARS
dir_diversity_output = '../results_diversity'
dir_reads_fastq = '../data/control_sample/'
fastq_basename = 'POOL.fastq.gz'
dir_results_profiling = '../results_profiling'
pools_file="../data/EM_EVPools/samples_profiling.txt"
In [ ]:
today = datetime.today().strftime('%Y-%m-%d')
os.makedirs(f'{dir_diversity_output}/{today}', exist_ok=True)
In [ ]:
# GENERAL VARIABLES
POOL_list = !cat {pools_file}
POOL_list += ['ACIDOLA', 'BLACTIS']
In [ ]:
# Attributes 
cutoff_NA_ratio = 0.35
In [ ]:
# Create the pooled dataframe. We are going to separate mean and percentage to have some representation of two variables.

# The dataframe we are going to show has values of all samples using the raw dataframe, and not the cutoff one. However, to do the filtering we 
# are going to use the cutoff one, because for some species that have discordant read values, in many pools they are discarded, but not in all of them; so
# when doing the heatmap here, they appear on top, when in reality they should have been discarded for not appearing in many datasets

list_dfs_means_raw, list_dfs_per_raw = [], []
list_dfs_means_cutoff, list_dfs_per_raw_cutoff = [], []

list_selected_index = []


for POOL in POOL_list:
    df_POOL_cutoff = pd.read_csv(f'{dir_diversity_output}/{today}/{POOL}.diversity_cutoff.tsv', sep='\t', index_col='Unnamed: 0')
    df_POOL_cutoff.reset_index(inplace=True)
    df_POOL_cutoff = df_POOL_cutoff[['index', 'name', 'mean (%)', 'mean']].rename(columns = {'mean (%)': f'mean (%) {POOL}', 'mean': f'mean {POOL}'})
    df_POOL_cutoff['taxon - genus'] = df_POOL_cutoff['index'].astype(str) + ' - ' + df_POOL_cutoff['name']
    df_POOL_cutoff = df_POOL_cutoff.set_index('taxon - genus')

    df_POOL_raw = pd.read_csv(f'{dir_diversity_output}/{today}/{POOL}.diversity_raw.tsv', sep='\t', index_col='Unnamed: 0')
    df_POOL_raw.reset_index(inplace=True)
    df_POOL_raw = df_POOL_raw[['index', 'name', 'mean (%)', 'mean']].rename(columns = {'mean (%)': f'mean (%) {POOL}', 'mean': f'mean {POOL}'})
    df_POOL_raw['taxon - genus'] = df_POOL_raw['index'].astype(str) + ' - ' + df_POOL_raw['name']
    df_POOL_raw = df_POOL_raw.set_index('taxon - genus')

    list_dfs_means_raw.append(df_POOL_raw[f'mean {POOL}'])
    list_dfs_per_raw.append(df_POOL_raw[f'mean (%) {POOL}'])

    list_dfs_means_cutoff.append(df_POOL_cutoff[f'mean {POOL}'])
    list_dfs_per_raw_cutoff.append(df_POOL_cutoff[f'mean (%) {POOL}'])

    list_selected_index += df_POOL_cutoff.index.tolist()


selected_index = list(set(list_selected_index))
df_mean_raw, df_per_raw = pd.concat(list_dfs_means_raw, axis=1), pd.concat(list_dfs_per_raw, axis=1)
df_mean_raw = df_mean_raw.loc[selected_index]
df_per_raw = df_per_raw.loc[selected_index]

df_mean_cutoff, df_per_cutoff = pd.concat(list_dfs_means_cutoff, axis=1), pd.concat(list_dfs_per_raw_cutoff, axis=1)
df_mean_cutoff = df_mean_cutoff.loc[selected_index]
df_per_cutoff = df_per_cutoff.loc[selected_index]


# NA cut to keep only species that have only a set of values as NAs
nonNA_index = df_mean_cutoff[df_mean_cutoff.isna().sum(1) < int(cutoff_NA_ratio * len(POOL_list))].index

# Then we order by the median of the values (using mean skewed some species much present in a few samples)
df_mean_cutoff_nonNA = df_mean_cutoff.loc[nonNA_index]
df_mean_cutoff_nonNA = df_mean_cutoff_nonNA.assign(m=df_mean_cutoff_nonNA.median(axis=1)).sort_values('m', ascending=False).drop('m', axis=1)
df_mean_cutoff_nonNA.to_csv(f'{dir_diversity_output}/{today}/df_mean_cutoff_nonNA.tsv', sep='\t')


df_per_cutoff_nonNA = df_per_cutoff.loc[nonNA_index]
df_per_cutoff_nonNA = df_per_cutoff_nonNA.assign(m=df_per_cutoff_nonNA.median(axis=1)).sort_values('m', ascending=False).drop('m', axis=1)
df_per_cutoff_nonNA.to_csv(f'{dir_diversity_output}/{today}/df_per_cutoff_nonNA.tsv', sep='\t')


# Then do the same in raw, but only with cutoff samples
df_mean_raw_cutoffindex_nonNA = df_mean_raw.loc[df_mean_cutoff_nonNA.index.values]
df_mean_raw_cutoffindex_nonNA.to_csv(f'{dir_diversity_output}/{today}/df_mean_raw_cutoffindex_nonNA.tsv', sep='\t')


df_per_raw_cutoffindex_nonNA = df_per_raw.loc[df_per_cutoff_nonNA.index.values]
df_per_raw_cutoffindex_nonNA.to_csv(f'{dir_diversity_output}/{today}/df_per_raw_cutoffindex_nonNA.tsv', sep='\t')


# Then do the same in raw as with cutoff
df_mean_raw_nonNA = df_mean_raw
df_mean_raw_nonNA = df_mean_raw_nonNA.assign(m=df_mean_raw_nonNA.median(axis=1)).sort_values('m', ascending=False).drop('m', axis=1)
df_mean_raw_nonNA.to_csv(f'{dir_diversity_output}/{today}/df_mean_raw_nonNA.tsv', sep='\t')


df_per_raw_nonNA = df_per_raw
df_per_raw_nonNA = df_per_raw_nonNA.assign(m=df_per_raw_nonNA.median(axis=1)).sort_values('m', ascending=False).drop('m', axis=1)
df_per_raw_nonNA.to_csv(f'{dir_diversity_output}/{today}/df_per_raw_nonNA.tsv', sep='\t')
In [ ]:
df_mean_raw_nonNA
Out[ ]:
mean POOL1 mean POOL2 mean POOL3 mean POOL4 mean POOL5 mean POOL6 mean POOL7 mean POOL8 mean POOL9 mean POOL10 mean POOL11 mean POOL12 mean ACIDOLA mean BLACTIS
taxon - genus
9605 - Homo 1709929.998 1933932.908 1288302.626 1523916.538 1034480.997 570833.058 1050951.429 1048025.882 1049918.995 1260704.944 1109145.249 1321520.166 2305.690 288.340
36034 - Saccharomycodes 81123.066 59855.630 47695.527 62199.371 45876.058 19458.106 37345.758 30799.966 52090.312 61101.690 34952.457 65185.823 6.961 3.252
1912216 - Cutibacterium 11290.359 51032.855 18299.991 21386.945 16490.551 22427.767 23451.886 21068.794 32960.381 9649.674 17538.432 38091.691 480.752 73.982
37914 - Dietzia 10825.673 52825.661 17149.516 8769.884 20165.245 28191.640 45763.874 15913.459 46163.752 8122.847 6675.907 40384.093 NaN NaN
13687 - Sphingomonas 4688.349 29033.744 12101.603 6543.925 17594.253 25613.385 28941.491 20731.628 31299.074 6169.392 6310.884 29744.999 13.303 1.084
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2948645 - Burzaovirus NaN NaN 1.976 NaN NaN 283.320 0.866 NaN NaN NaN NaN NaN 1.856 NaN
1184396 - Mesotoga 1.705 3.427 2.964 NaN 1.079 1080.139 1.731 9.744 1.125 1.313 NaN NaN NaN 1.084
499228 - Tepidanaerobacter NaN 1.142 20.749 NaN 1.079 105.992 1.443 313.814 1.687 6.420 1.157 1.194 NaN NaN
2082587 - Parolsenella 2.558 1.142 56.812 1.023 1.079 1289.236 0.866 540.271 1.125 0.875 8.101 5.971 1.856 1.084
51196 - Syntrophobotulus NaN 1.142 NaN 1.023 74.964 428.590 0.866 112.220 1.125 NaN 5.208 1.194 NaN NaN

817 rows × 14 columns

In [ ]:
df_mean_raw_cutoffindex_nonNA
Out[ ]:
mean POOL1 mean POOL2 mean POOL3 mean POOL4 mean POOL5 mean POOL6 mean POOL7 mean POOL8 mean POOL9 mean POOL10 mean POOL11 mean POOL12 mean ACIDOLA mean BLACTIS
taxon - genus
9605 - Homo 1709929.998 1933932.908 1288302.626 1523916.538 1034480.997 570833.058 1050951.429 1048025.882 1049918.995 1260704.944 1109145.249 1321520.166 2305.690 288.340
1912216 - Cutibacterium 11290.359 51032.855 18299.991 21386.945 16490.551 22427.767 23451.886 21068.794 32960.381 9649.674 17538.432 38091.691 480.752 73.982
13687 - Sphingomonas 4688.349 29033.744 12101.603 6543.925 17594.253 25613.385 28941.491 20731.628 31299.074 6169.392 6310.884 29744.999 13.303 1.084
37914 - Dietzia 10825.673 52825.661 17149.516 8769.884 20165.245 28191.640 45763.874 15913.459 46163.752 8122.847 6675.907 40384.093 NaN NaN
469 - Acinetobacter 2772.345 11158.895 7052.537 9496.731 13154.365 23846.245 14577.205 80605.920 21943.776 15249.816 3400.060 15808.468 36.505 45.527
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
724 - Haemophilus 97.470 969.743 204.851 294.488 659.759 4195.865 1048.730 519.103 439.022 248.185 263.472 6373.544 0.928 NaN
482 - Neisseria 440.461 2410.079 156.767 343.314 584.614 1009.670 356.020 1045.263 827.701 194.054 362.226 15916.843 3.248 NaN
497 - Psychrobacter 193.519 255.476 673.505 414.124 3318.928 455.738 484.118 1028.127 1373.220 291.227 289.318 1429.082 0.928 NaN
84567 - Pedobacter 138.674 359.418 362.606 390.606 602.592 566.640 384.005 526.159 774.733 201.349 329.436 723.696 2.784 NaN
2040 - Aeromicrobium 57.970 664.009 265.779 348.000 356.306 590.467 604.715 283.575 578.238 221.193 263.472 807.292 NaN 1.084

102 rows × 14 columns

In [ ]:
df_mean_cutoff_nonNA
Out[ ]:
mean POOL1 mean POOL2 mean POOL3 mean POOL4 mean POOL5 mean POOL6 mean POOL7 mean POOL8 mean POOL9 mean POOL10 mean POOL11 mean POOL12 mean ACIDOLA mean BLACTIS
taxon - genus
9605 - Homo 1709929.998 1933932.908 1288302.626 1523916.538 1034480.997 570833.058 1050951.429 1048025.882 1049918.995 1260704.944 1109145.249 1321520.166 2305.690 288.340
1912216 - Cutibacterium 11290.359 51032.855 18299.991 21386.945 16490.551 22427.767 23451.886 21068.794 32960.381 9649.674 17538.432 38091.691 480.752 NaN
13687 - Sphingomonas 4688.349 29033.744 12101.603 6543.925 17594.253 25613.385 28941.491 20731.628 31299.074 6169.392 6310.884 29744.999 NaN NaN
37914 - Dietzia 10825.673 52825.661 17149.516 8769.884 20165.245 28191.640 45763.874 15913.459 46163.752 8122.847 6675.907 40384.093 NaN NaN
469 - Acinetobacter 2772.345 11158.895 7052.537 9496.731 13154.365 23846.245 14577.205 80605.920 21943.776 15249.816 3400.060 15808.468 NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
724 - Haemophilus NaN 969.743 204.851 294.488 659.759 4195.865 1048.730 519.103 439.022 248.185 263.472 6373.544 NaN NaN
482 - Neisseria 440.461 2410.079 156.767 343.314 584.614 1009.670 356.020 1045.263 827.701 194.054 362.226 15916.843 NaN NaN
497 - Psychrobacter 193.519 255.476 673.505 414.124 3318.928 455.738 484.118 1028.127 1373.220 291.227 289.318 1429.082 NaN NaN
84567 - Pedobacter 138.674 359.418 362.606 390.606 602.592 566.640 384.005 526.159 774.733 201.349 329.436 723.696 NaN NaN
2040 - Aeromicrobium NaN 664.009 265.779 348.000 356.306 590.467 604.715 283.575 578.238 221.193 263.472 807.292 NaN NaN

102 rows × 14 columns

In [ ]:
N = 25
fig, ax = plt.subplots(1, 1, figsize=(9, 7))
sns.heatmap(np.log10(df_mean_raw_nonNA.iloc[1:N, :]), yticklabels=True, annot=True, cmap='Blues')
plt.title('log10 mean counts')
plt.tight_layout()

fig, ax = plt.subplots(1, 1, figsize=(9, 7))
sns.heatmap(np.log10(df_mean_raw_cutoffindex_nonNA.iloc[1:N, :]), yticklabels=True, annot=True, cmap='Blues')
plt.title('log10 mean counts')
plt.tight_layout()


fig, ax = plt.subplots(1, 1, figsize=(9, 7))
sns.heatmap(np.log10(df_mean_cutoff_nonNA.iloc[1:N, :]), yticklabels=True, annot=True, cmap='Blues')
plt.title('log10 mean counts')
plt.tight_layout()
In [ ]:
df_mean_raw_cutoffindex_nonNA.iloc[1:N, :]
Out[ ]:
mean POOL1 mean POOL2 mean POOL3 mean POOL4 mean POOL5 mean POOL6 mean POOL7 mean POOL8 mean POOL9 mean POOL10 mean POOL11 mean POOL12 mean ACIDOLA mean BLACTIS
taxon - genus
1912216 - Cutibacterium 11290.359 51032.855 18299.991 21386.945 16490.551 22427.767 23451.886 21068.794 32960.381 9649.674 17538.432 38091.691 480.752 73.982
13687 - Sphingomonas 4688.349 29033.744 12101.603 6543.925 17594.253 25613.385 28941.491 20731.628 31299.074 6169.392 6310.884 29744.999 13.303 1.084
37914 - Dietzia 10825.673 52825.661 17149.516 8769.884 20165.245 28191.640 45763.874 15913.459 46163.752 8122.847 6675.907 40384.093 NaN NaN
469 - Acinetobacter 2772.345 11158.895 7052.537 9496.731 13154.365 23846.245 14577.205 80605.920 21943.776 15249.816 3400.060 15808.468 36.505 45.527
12916 - Acidovorax 3195.614 11895.625 13657.993 4548.717 13717.947 19686.625 30035.517 10773.411 33610.055 5720.076 4023.829 25862.296 3.094 NaN
1386 - Bacillus 2112.507 15043.006 7450.383 4710.447 7874.768 48486.421 12120.551 15452.481 17655.743 4005.104 4657.627 20831.152 1099.170 76.963
106589 - Cupriavidus 2594.172 14180.917 12334.036 3906.569 11694.717 20391.026 26735.047 9139.243 30993.361 5864.085 3695.164 23304.578 9.745 NaN
1716 - Corynebacterium 3982.049 12378.212 4941.618 5210.548 12241.310 13249.976 19116.029 7387.899 19589.483 4648.328 21728.329 9789.607 41.764 5.781
1678 - Bifidobacterium 715.891 13496.158 10190.759 347.404 1395.201 405656.412 12225.569 9068.686 9712.665 333.832 189.792 1440.626 82.832 1746265.778
286 - Pseudomonas 1579.691 7116.301 8640.791 2966.866 20818.262 31180.146 5227.350 215000.525 113899.851 48460.663 5460.869 8496.268 46.405 2.891
196013 - Caldimonas 2271.286 8988.396 7712.293 2637.356 8077.819 10798.434 15713.642 5392.375 17120.724 3553.162 2544.259 14416.905 1.856 NaN
1827 - Rhodococcus 2528.316 10710.003 5306.694 2767.728 8542.706 9914.901 15098.469 5647.391 15276.607 2576.179 3059.533 18296.921 20.418 NaN
57493 - Kocuria 1031.744 15190.352 2149.453 9120.696 54524.200 31542.960 5599.527 6025.127 8896.870 2415.975 2836.470 7362.656 19.799 1.445
909656 - Phocaeicola 197.497 6149.319 21086.145 540.917 910.719 919244.317 15336.056 179271.849 11252.007 165.238 344.481 1222.084 1.856 NaN
407 - Methylobacterium 1188.605 5606.004 5145.645 2551.464 6594.710 14681.088 7832.803 5201.617 11522.001 2332.809 2805.802 14268.225 2.475 1.084
1301 - Streptococcus 1968.220 69050.196 2013.929 2781.277 4284.027 95039.999 5335.541 10572.910 9364.673 3412.874 4222.012 54646.247 68.679 1.445
265 - Paracoccus 903.442 4292.073 774.860 7220.583 10034.174 7797.582 1207.410 60205.409 2228.858 4524.892 10290.737 997.471 3.403 2.710
1883 - Streptomyces 1273.358 8505.714 3106.110 3501.478 4198.816 7967.039 5357.828 3827.253 7735.333 2286.922 2236.425 7807.801 70.535 7.227
1279 - Staphylococcus 2020.010 5626.945 1953.988 3297.654 2980.779 13478.350 4838.007 4716.617 7185.501 1506.838 2477.426 5648.654 146.174 9.033
816 - Bacteroides 227.335 3974.156 9193.922 306.758 823.350 436335.983 5810.066 208378.933 4382.530 221.047 379.874 2475.217 1.237 NaN
572511 - Blautia 468.309 6134.470 29231.111 737.754 1630.521 964184.112 20585.045 43023.591 20057.566 158.745 289.028 617.411 2.784 1.084
283 - Comamonas 1886.167 4940.377 4192.446 782.234 2140.800 3336.158 6658.211 4255.640 5241.824 2053.327 1709.288 3851.952 2.166 NaN
1269 - Micrococcus 1110.814 8954.701 2225.037 6058.480 9975.120 4722.578 2774.230 2128.828 2855.938 10248.032 2784.971 5304.719 10.828 1.084
57494 - Nesterenkonia 1232.153 4312.252 3382.017 1572.989 4056.707 5007.053 9124.963 3493.616 8785.685 1613.714 1117.537 9788.213 NaN NaN
In [ ]:
N = 100

fig, ax = plt.subplots(1, 1, figsize=(9, 22))
sns.heatmap(np.log10(df_mean_raw_cutoffindex_nonNA.iloc[1:N, :]), yticklabels=True, annot=True, cmap='Blues')
plt.title('log10 mean counts')
plt.tight_layout()
plt.savefig(f'{dir_diversity_output}/{today}/heatmap_mean_nonNA_annot.png', dpi=300)



fig, ax = plt.subplots(1, 1, figsize=(9, 22))
sns.heatmap(np.log10(df_mean_raw_cutoffindex_nonNA.iloc[1:N, :]), yticklabels=True, annot=False, cmap='Blues')
plt.title('log10 mean counts')
plt.tight_layout()
plt.savefig(f'{dir_diversity_output}/{today}/heatmap_mean_nonNA.png', dpi=300)
In [ ]:
N = 100

fig, ax = plt.subplots(1, 1, figsize=(9, 22))
sns.heatmap(np.log10(df_per_raw_cutoffindex_nonNA.iloc[1:N, :]), yticklabels=True, annot=True, cmap='Blues')
plt.title('log10 percentage counts')
plt.tight_layout()
plt.savefig(f'{dir_diversity_output}/{today}/heatmap_per_nonNA_annot.png', dpi=300)



fig, ax = plt.subplots(1, 1, figsize=(9, 22))
sns.heatmap(np.log10(df_per_raw_cutoffindex_nonNA.iloc[1:N, :]), yticklabels=True, annot=False, cmap='Blues')
plt.title('log10 percentage counts')
plt.tight_layout()
plt.savefig(f'{dir_diversity_output}/{today}/heatmap_per_nonNA.png', dpi=300)
In [ ]:
## Try a quick wilcoxon test
from scipy.stats import mannwhitneyu, ttest_ind

For this part we can use the follwoign databases to get some insights https://mbodymap.microbiome.cloud/ | https://www.microbiomeatlas.org/

In [ ]:
# RR [POOL 1-4] vs HC [POOL 8-12]

list_pvals_mannwhitney = []
list_pvals_ttest = []
L2FC = []

for row in range(len(df_mean_raw_cutoffindex_nonNA)):
    condition_vals = df_mean_raw_cutoffindex_nonNA.iloc[row, :4].values
    reference_vals = df_mean_raw_cutoffindex_nonNA.iloc[row, 8:12].values

    res = mannwhitneyu(condition_vals, reference_vals)
    list_pvals_mannwhitney.append(res.pvalue)

    res = ttest_ind(condition_vals, reference_vals)
    list_pvals_ttest.append(res.pvalue)

    L2FC.append(np.log2(condition_vals.mean() / reference_vals.mean()))




df_pval = df_mean_raw_cutoffindex_nonNA.iloc[:, [0,1,2,3,8,9,10,11]]
df_pval['log2FC'] = L2FC
df_pval['pval_ttest'] = list_pvals_ttest
df_pval['pval_MW'] = list_pvals_mannwhitney


df_pval = df_pval.sort_values(by=['pval_MW', 'pval_ttest'])
display(df_pval.iloc[:15])

df_pval_pos = df_pval[(df_pval['pval_MW'] < 0.05)]


fig, ax = plt.subplots(1, 1, figsize=(9, 2))
sns.heatmap(np.log10(df_pval_pos.iloc[:, :-3]), yticklabels=True, annot=True, cmap='Blues')
plt.title('log10 percentage counts')
plt.tight_layout()
/tmp/ipykernel_2735545/465228851.py:23: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_pval['log2FC'] = L2FC
/tmp/ipykernel_2735545/465228851.py:24: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_pval['pval_ttest'] = list_pvals_ttest
/tmp/ipykernel_2735545/465228851.py:25: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_pval['pval_MW'] = list_pvals_mannwhitney
mean POOL1 mean POOL2 mean POOL3 mean POOL4 mean POOL9 mean POOL10 mean POOL11 mean POOL12 log2FC pval_ttest pval_MW
taxon - genus
53457 - Janibacter 242.680 101.657 476.888 486.383 802.670 538.391 526.558 604.673 -0.919 0.043 0.029
9605 - Homo 1709929.998 1933932.908 1288302.626 1523916.538 1049918.995 1260704.944 1109145.249 1321520.166 0.445 0.030 0.057
125216 - Roseomonas 174.337 458.600 707.427 485.701 2409.323 572.095 1047.715 1634.288 -1.633 0.058 0.057
1696 - Brevibacterium 334.182 519.995 905.362 1117.112 1698.713 1061.900 2918.636 1184.068 -1.255 0.073 0.057
237 - Flavobacterium 229.324 829.250 590.182 388.220 894.731 1398.285 391.157 1552.882 -1.057 0.110 0.114
165696 - Novosphingobium 210.782 770.711 370.016 277.105 1370.783 406.857 525.111 1396.340 -1.184 0.129 0.114
165695 - Sphingobium 759.582 2048.377 1756.712 506.663 3202.806 2728.723 818.769 2891.203 -0.927 0.133 0.114
29580 - Janthinobacterium 230.745 317.917 601.050 201.438 5777.125 635.856 271.187 736.236 -2.457 0.292 0.114
286 - Pseudomonas 1579.691 7116.301 8640.791 2966.866 113899.851 48460.663 5460.869 8496.268 -3.118 0.174 0.200
469 - Acinetobacter 2772.345 11158.895 7052.537 9496.731 21943.776 15249.816 3400.060 15808.468 -0.888 0.181 0.200
1578 - Lactobacillus 673.195 4523.563 821.709 1159.547 3442.707 1517.125 16327.927 2874.484 -1.751 0.279 0.200
89966 - Hymenobacter 248.647 3774.268 603.355 408.670 5480.882 719.606 727.344 2339.872 -0.880 0.479 0.200
1716 - Corynebacterium 3982.049 12378.212 4941.618 5210.548 19589.483 4648.328 21728.329 9789.607 -1.072 0.154 0.343
1822464 - Paraburkholderia 177.747 640.784 1001.859 388.902 1729.088 584.789 411.217 1560.048 -0.956 0.220 0.343
497 - Psychrobacter 193.519 255.476 673.505 414.124 1373.220 291.227 289.318 1429.082 -1.138 0.221 0.343
In [ ]:
# SP vs HC

list_pvals_mannwhitney = []
list_pvals_ttest = []
L2FC = []

for row in range(len(df_mean_raw_cutoffindex_nonNA)):
    condition_vals = df_per_raw_cutoffindex_nonNA.iloc[row, 4:8].values
    reference_vals = df_per_raw_cutoffindex_nonNA.iloc[row, 8:12].values

    res = mannwhitneyu(condition_vals, reference_vals)
    list_pvals_mannwhitney.append(res.pvalue)

    res = ttest_ind(condition_vals, reference_vals)
    list_pvals_ttest.append(res.pvalue)

    L2FC.append(np.log2(condition_vals.mean() / reference_vals.mean()))




df_pval = df_mean_raw_cutoffindex_nonNA.iloc[:, [4,5,6,7,8,9,10,11]]
df_pval['log2FC'] = L2FC
df_pval['pval_ttest'] = list_pvals_ttest
df_pval['pval_MW'] = list_pvals_mannwhitney


df_pval = df_pval.sort_values(by=['pval_MW', 'pval_ttest'])
display(df_pval.iloc[:15])

df_pval_pos = df_pval[(df_pval['pval_MW'] < 0.05)]


fig, ax = plt.subplots(1, 1, figsize=(9, 3))
sns.heatmap(np.log10(df_pval_pos.iloc[:, :-3]), yticklabels=True, annot=True, cmap='Blues')
plt.title('log10 percentage counts')
plt.tight_layout()
/tmp/ipykernel_2735545/1233428652.py:23: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_pval['log2FC'] = L2FC
/tmp/ipykernel_2735545/1233428652.py:24: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_pval['pval_ttest'] = list_pvals_ttest
/tmp/ipykernel_2735545/1233428652.py:25: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_pval['pval_MW'] = list_pvals_mannwhitney
mean POOL5 mean POOL6 mean POOL7 mean POOL8 mean POOL9 mean POOL10 mean POOL11 mean POOL12 log2FC pval_ttest pval_MW
taxon - genus
338 - Xanthomonas 1603.645 1703.963 2571.048 974.453 674.235 610.614 381.610 458.182 1.690 0.012 0.029
59732 - Chryseobacterium 2593.373 3726.192 2432.708 3218.189 1947.708 557.213 499.169 2170.791 1.210 0.019 0.029
75 - Caulobacter 982.627 1434.362 1276.652 4758.112 902.512 303.119 298.865 913.577 1.805 0.146 0.029
9605 - Homo 1034480.997 570833.058 1050951.429 1048025.882 1049918.995 1260704.944 1109145.249 1321520.166 -0.356 0.102 0.057
169133 - Citricoccus 1091.029 1092.269 1554.486 587.645 2314.825 351.048 270.416 1788.143 4.077 0.279 0.057
572511 - Blautia 1630.521 964184.112 20585.045 43023.591 20057.566 158.745 289.028 617.411 5.607 0.326 0.057
52972 - Polaromonas 1054.176 627.578 1019.014 611.836 713.985 251.541 256.143 582.779 0.876 0.066 0.114
517 - Bordetella 792.250 4083.736 1213.180 4430.690 945.261 511.253 283.145 1150.032 1.864 0.096 0.114
1839 - Nocardioides 3947.496 4966.548 4746.983 2615.424 3319.241 1035.856 1979.511 3760.594 0.689 0.108 0.114
1485 - Clostridium 1169.229 135619.091 6065.036 218546.808 3276.679 609.885 620.876 1410.372 5.932 0.144 0.114
816 - Bacteroides 823.350 436335.983 5810.066 208378.933 4382.530 221.047 379.874 2475.217 6.448 0.170 0.114
909656 - Phocaeicola 910.719 919244.317 15336.056 179271.849 11252.007 165.238 344.481 1222.084 6.424 0.252 0.114
665874 - Limnohabitans 747.487 1854.432 2146.507 289.287 3800.168 1221.228 1971.217 3390.386 -1.043 0.124 0.200
165695 - Sphingobium 4139.761 3742.727 2614.973 46364.752 3202.806 2728.723 818.769 2891.203 2.560 0.314 0.200
1678 - Bifidobacterium 1395.201 405656.412 12225.569 9068.686 9712.665 333.832 189.792 1440.626 5.197 0.336 0.200
In [ ]:
# RR vs SP

list_pvals_mannwhitney = []
list_pvals_ttest = []
L2FC = []

for row in range(len(df_mean_raw_cutoffindex_nonNA)):
    condition_vals = df_per_raw_cutoffindex_nonNA.iloc[row, :4].values
    reference_vals = df_per_raw_cutoffindex_nonNA.iloc[row, 4:8].values

    res = mannwhitneyu(condition_vals, reference_vals)
    list_pvals_mannwhitney.append(res.pvalue)

    res = ttest_ind(condition_vals, reference_vals)
    list_pvals_ttest.append(res.pvalue)

    L2FC.append(np.log2(condition_vals.mean() / reference_vals.mean()))




df_pval = df_mean_raw_cutoffindex_nonNA.iloc[:, [0,1,2,3,4,5,6,7]]
df_pval['log2FC'] = L2FC
df_pval['pval_ttest'] = list_pvals_ttest
df_pval['pval_MW'] = list_pvals_mannwhitney


df_pval = df_pval.sort_values(by=['pval_MW', 'pval_ttest'])
display(df_pval.iloc[:15])

df_pval_pos = df_pval[(df_pval['pval_MW'] < 0.05)]


fig, ax = plt.subplots(1, 1, figsize=(9, 5))
sns.heatmap(np.log10(df_pval_pos.iloc[:, :-3]), yticklabels=True, annot=True, cmap='Blues')
plt.title('log10 percentage counts')
plt.tight_layout()
/tmp/ipykernel_2735545/248738686.py:23: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_pval['log2FC'] = L2FC
/tmp/ipykernel_2735545/248738686.py:24: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_pval['pval_ttest'] = list_pvals_ttest
/tmp/ipykernel_2735545/248738686.py:25: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_pval['pval_MW'] = list_pvals_mannwhitney
mean POOL1 mean POOL2 mean POOL3 mean POOL4 mean POOL5 mean POOL6 mean POOL7 mean POOL8 log2FC pval_ttest pval_MW
taxon - genus
9605 - Homo 1709929.998 1933932.908 1288302.626 1523916.538 1034480.997 570833.058 1050951.429 1048025.882 0.801 0.009 0.029
165696 - Novosphingobium 210.782 770.711 370.016 277.105 1103.073 1689.090 1261.506 2477.333 -2.004 0.010 0.029
1763 - Mycobacterium 653.232 1037.895 795.609 517.911 1171.656 1748.223 1818.039 1193.434 -0.981 0.012 0.029
469 - Acinetobacter 2772.345 11158.895 7052.537 9496.731 13154.365 23846.245 14577.205 80605.920 -2.117 0.166 0.029
29404 - Microlunatus 241.543 678.763 574.291 535.805 1914.559 23500.181 1537.464 1775.031 -3.823 0.266 0.029
165695 - Sphingobium 759.582 2048.377 1756.712 506.663 4139.761 3742.727 2614.973 46364.752 -3.487 0.273 0.029
52972 - Polaromonas 100.382 626.792 341.363 203.483 1054.176 627.578 1019.014 611.836 -1.381 0.022 0.057
33882 - Microbacterium 659.412 2986.042 1860.208 1189.967 3473.980 4616.946 4375.672 2507.824 -1.161 0.025 0.057
41275 - Brevundimonas 708.218 3265.886 1339.765 2465.827 2999.655 4858.462 4968.558 3455.313 -1.065 0.031 0.057
84567 - Pedobacter 138.674 359.418 362.606 390.606 602.592 566.640 384.005 526.159 -0.733 0.034 0.057
407 - Methylobacterium 1188.605 5606.004 5145.645 2551.464 6594.710 14681.088 7832.803 5201.617 -1.243 0.080 0.057
1866885 - Mycolicibacterium 920.492 924.340 1120.176 379.102 1922.648 2701.937 1145.741 1002.424 -1.018 0.089 0.057
75 - Caulobacter 138.532 1170.773 482.816 353.454 982.627 1434.362 1276.652 4758.112 -1.978 0.135 0.057
44249 - Paenibacillus 218.241 818.209 1700.724 221.548 897.416 34718.108 1746.922 3942.162 -2.427 0.221 0.057
125216 - Roseomonas 174.337 458.600 707.427 485.701 5673.565 2165.477 703.673 27898.531 -4.319 0.222 0.057
In [ ]:
# SEX

list_pvals_mannwhitney = []
list_pvals_ttest = []
L2FC = []

for row in range(len(df_mean_raw_cutoffindex_nonNA)):
    condition_vals = df_per_raw_cutoffindex_nonNA.iloc[row, [2,3, 6,7, 10,11]].values  #FEMALE
    reference_vals = df_per_raw_cutoffindex_nonNA.iloc[row, [0,1, 4,5, 8,9]].values  #MALE

    res = mannwhitneyu(condition_vals, reference_vals)
    list_pvals_mannwhitney.append(res.pvalue)

    res = ttest_ind(condition_vals, reference_vals)
    list_pvals_ttest.append(res.pvalue)

    L2FC.append(np.log2(condition_vals.mean() / reference_vals.mean()))




df_pval = df_mean_raw_cutoffindex_nonNA.iloc[:, [0,1,4,5,8,9,2,3,6,7,10,11]]
df_pval['log2FC'] = L2FC
df_pval['pval_ttest'] = list_pvals_ttest
df_pval['pval_MW'] = list_pvals_mannwhitney


df_pval = df_pval.sort_values(by=['pval_MW', 'pval_ttest'])
display(df_pval.iloc[:15])

df_pval_pos = df_pval[(df_pval['pval_MW'] < 0.05)]

try:
    fig, ax = plt.subplots(1, 1, figsize=(9, 5))
    sns.heatmap(np.log10(df_pval_pos.iloc[:, :-3]), yticklabels=True, annot=True, cmap='Blues')
    plt.title('log10 percentage counts')
    plt.tight_layout()
except:
    print('NO SIGNIFICANT SAMPLES')
/tmp/ipykernel_2735545/3247866785.py:23: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_pval['log2FC'] = L2FC
mean POOL1 mean POOL2 mean POOL5 mean POOL6 mean POOL9 mean POOL10 mean POOL3 mean POOL4 mean POOL7 mean POOL8 mean POOL11 mean POOL12 log2FC pval_ttest pval_MW
taxon - genus
57495 - Dermacoccus 305.481 5917.258 13471.751 6928.560 4459.404 2394.527 423.370 3007.000 1343.009 1635.176 3198.117 2476.212 -1.470 0.091 0.180
2034 - Curtobacterium 287.578 1146.786 714.049 1274.796 1256.223 301.441 400.480 268.243 562.881 382.020 288.160 751.960 -0.908 0.085 0.240
2053 - Gordonia 695.644 2967.767 3917.834 4503.012 2424.041 483.676 683.715 935.358 1951.547 1333.037 644.310 2131.382 -0.965 0.122 0.240
1269 - Micrococcus 1110.814 8954.701 9975.120 4722.578 2855.938 10248.032 2225.037 6058.480 2774.230 2128.828 2784.971 5304.719 -0.832 0.145 0.240
149698 - Massilia 353.789 6106.866 2303.672 1937.320 86421.476 4726.679 1326.179 517.058 1073.470 10077.409 707.381 1935.530 -2.703 0.329 0.240
42255 - Rubrobacter 624.033 2149.939 1150.892 3435.797 2733.972 256.283 770.414 215.242 1529.386 963.869 559.830 1344.093 -0.943 0.162 0.310
28067 - Rubrivivax 1881.478 4131.116 1832.583 4063.519 4614.932 1775.669 2935.429 718.326 4460.710 1477.345 1070.475 2922.551 -0.430 0.350 0.310
57493 - Kocuria 1031.744 15190.352 54524.200 31542.960 8896.870 2415.975 2149.453 9120.696 5599.527 6025.127 2836.470 7362.656 -1.779 0.146 0.394
316612 - Methylibium 216.110 2643.853 1197.633 2133.420 2423.573 375.779 928.992 327.209 2010.186 434.434 336.476 1528.600 -0.692 0.296 0.394
391952 - Aquincola 951.821 4937.236 1132.556 2768.218 2224.077 876.308 3005.578 545.008 1318.198 520.111 832.077 2486.363 -0.566 0.387 0.394
338 - Xanthomonas 374.036 2138.231 1603.645 1703.963 674.235 610.614 512.292 339.224 2571.048 974.453 381.610 458.182 -0.440 0.513 0.394
2897311 - Fulvia 303.918 989.160 550.638 830.466 1007.416 267.445 471.783 289.887 612.793 398.651 440.920 711.754 -0.433 0.279 0.485
590 - Salmonella 131.286 937.761 675.219 1392.918 2223.327 690.424 378.414 331.981 797.150 13920.034 201.365 992.396 -0.580 0.357 0.485
165695 - Sphingobium 759.582 2048.377 4139.761 3742.727 3202.806 2728.723 1756.712 506.663 2614.973 46364.752 818.769 2891.203 1.725 0.412 0.485
561 - Escherichia 234.723 2897.806 1398.257 3607.204 3752.919 1138.062 655.062 637.035 1080.178 5030.515 411.988 1640.060 -0.463 0.535 0.485
NO SIGNIFICANT SAMPLES